This is a guided analysis to show how to interpret (CD8 T cell) single-cell RNA-seq data using Seurat and other R packages. We use a public dataset of small size (~1K cells) in order to make fast calculations GEO GSE85947
Load packages and source files (custom functions)
#install.packages("Seurat")
library(Seurat)
## Warning: replacing previous import 'mclust::dmvnorm' by 'mvtnorm::dmvnorm'
## when loading 'fpc'
source("functions.R")
Read expression matrix (\(log2 (TPM+1 )\))
if(!file.exists("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")){
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE86028&format=file&file=GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz",destfile = "input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz")
}
m <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.tpm.log2.txt.gz"),row.names=1,sep=" ",header=T)
m[1:10,1:5]
## AH3VMFBGXY_A10_S10 AH3VMFBGXY_A10_S298 AH3VMFBGXY_A11_S203
## 0610005C13Rik 0.0000 0.000 0.0000
## 0610007N19Rik 0.0000 0.000 0.0000
## 0610007P14Rik 0.0000 0.000 0.0000
## 0610008F07Rik 0.0000 0.000 0.0000
## 0610009B14Rik 0.0000 0.000 0.0000
## 0610009B22Rik 0.0000 0.000 0.0000
## 0610009D07Rik 5.0492 6.931 8.4522
## 0610009L18Rik 0.0000 0.000 0.0000
## 0610009O20Rik 0.0000 0.000 0.0000
## 0610010B08Rik 0.0000 0.000 0.0000
## AH3VMFBGXY_A11_S299 AH3VMFBGXY_A12_S12
## 0610005C13Rik 0.0000 0
## 0610007N19Rik 0.0000 0
## 0610007P14Rik 0.0000 0
## 0610008F07Rik 0.0000 0
## 0610009B14Rik 0.0000 0
## 0610009B22Rik 0.0000 0
## 0610009D07Rik 6.0911 0
## 0610009L18Rik 0.0000 0
## 0610009O20Rik 0.0000 0
## 0610010B08Rik 0.0000 0
Read meta data
meta <- read.csv(gzfile("input/GSE86028_TILs_sc_wt_mtko.index.txt.gz"),sep="\t",as.is=F,row.names=1)
meta$mouse_id = factor(meta$mouse_id)
meta$plate_uniq_num = factor(meta$plate_uniq_num)
meta$batch_bio = factor(meta$batch_bio)
head(meta)
## cell_type condition_plate seq_id batch_bio
## AH3VMFBGXY_G10_S370 CD8 MTKO AH3VMFBGXY 2
## AH3VMFBGXY_G7_S367 CD8 MTKO AH3VMFBGXY 2
## AH3VMFBGXY_G6_S366 CD8 MTKO AH3VMFBGXY 2
## AH3VMFBGXY_D2_S326 CD8 MTKO AH3VMFBGXY 2
## AH3VMFBGXY_B10_S22 CD8 WT AH3VMFBGXY 2
## HYGHVBGXX_D12_S144 CD8 WT HYGHVBGXX 2
## mouse_id plate_uniq_num
## AH3VMFBGXY_G10_S370 7 1
## AH3VMFBGXY_G7_S367 7 1
## AH3VMFBGXY_G6_S366 7 1
## AH3VMFBGXY_D2_S326 7 1
## AH3VMFBGXY_B10_S22 3 2
## HYGHVBGXX_D12_S144 3 3
str(meta)
## 'data.frame': 1061 obs. of 6 variables:
## $ cell_type : Factor w/ 1 level "CD8": 1 1 1 1 1 1 1 1 1 1 ...
## $ condition_plate: Factor w/ 2 levels "MTKO","WT": 1 1 1 1 2 2 1 1 2 2 ...
## $ seq_id : Factor w/ 4 levels "AH3VMFBGXY","H3332BGXY",..: 1 1 1 1 1 4 1 1 1 1 ...
## $ batch_bio : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mouse_id : Factor w/ 9 levels "1","2","3","4",..: 7 7 7 7 3 3 8 7 3 3 ...
## $ plate_uniq_num : Factor w/ 16 levels "1","2","3","4",..: 1 1 1 1 2 3 4 1 2 2 ...
Create Seurat object
data.seurat <- CreateSeuratObject(m, meta.data = meta, project = "tumorTILS", min.cells = 3, min.genes = 500, is.expr = 0,do.scale=F, do.center=F)
rm(m,meta)
Explore samples
table(data.seurat@meta.data$mouse_id)
##
## 1 2 3 4 5 6 7 8 9
## 130 158 108 138 78 80 133 106 130
summary(data.seurat@meta.data$nGene)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1532 2239 2891 3079 3787 5960
summary(data.seurat@meta.data$nUMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8959 13733 17073 17835 21256 32145
Caclulate ribosomal content (might indicate techincal variability) (same for mitochondrial content)
ribo.genes <- grep(pattern = "^Rp[ls]", x = rownames(x = data.seurat@data), value = TRUE)
percent.ribo <- Matrix::colSums(data.seurat@raw.data[ribo.genes, ])/Matrix::colSums(data.seurat@raw.data)
summary(percent.ribo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02911 0.04271 0.05150 0.05308 0.06220 0.09850
Add new ‘metadata’ to our Seurat object
data.seurat <- AddMetaData(object = data.seurat, metadata = percent.ribo, col.name = "percent.ribo")
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1061"
In this dataset ‘high quality’ cells were already selected, however we might want to filter further based on these parameters
hist(data.seurat@meta.data$nGene)
hist(data.seurat@meta.data$nUMI)
hist(data.seurat@meta.data$percent.ribo)
Note that Seurat automatically calculates ‘nUMI’ per cell, assuming @data matrix contains UMI counts. In this case however, ‘nUMI’ contains the sum of the normalized expression values (log2 TPM) for each cell
Look at distributions per sample
data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
VlnPlot(object = data.seurat, features.plot = c("nGene", "nUMI","percent.ribo"), nCol = 2,point.size.use=0.001)
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1061"
data.seurat <- FilterCells(object = data.seurat, subset.names = c("nGene", "nUMI","percent.ribo"),
low.thresholds = c(1500, 8000, -Inf), high.thresholds = c(6000, 30000,0.1))
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 1056"
There are two groups of mice, Wildtype (WT) and MT KO
table(data.seurat@meta.data$condition_plate)
##
## MTKO WT
## 527 529
Will analyze only cells from WT mice
data.seurat <- SubsetData(data.seurat,cells.use = data.seurat@meta.data$condition_plate=="WT")
table(data.seurat@meta.data$mouse_id)
##
## 1 2 3 4 5 6 7 8 9
## 130 158 108 133 0 0 0 0 0
Extract highly variable genes
set.seed(1234)
data.seurat <- FindVariableGenes(
object = data.seurat,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 1,
x.high.cutoff = 15,
y.cutoff = 1
)
length(data.seurat@var.genes)
## [1] 1616
head(row.names(data.seurat@hvg.info)[order(data.seurat@hvg.info$gene.mean,decreasing = T)],n=20)
Check expression values of a few genes
VlnPlot(data.seurat,features.plot = qq(Gapdh,Actg1,Actb,Ptprc,Cd2,Cd3g,Cd8a,Cd8b1), point.size.use = exp(-3), x.lab.rot=T)
Some variation but no evident systematic biases
Looking at expression values
plot(density(as.numeric(data.seurat@data["Gzmb",])))
Scale data (useful for example to do PCA on scaled variables, to make heatmaps, etc)
data.seurat <- ScaleData(
object = data.seurat, do.scale=T, do.center = T
)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## Scaling data matrix
plot(density(as.numeric(data.seurat@scale.data["Gzmb",])))
PCA on highly variable genes
set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 10)
## [1] "PC1"
## [1] "Pde2a" "Pydc3" "Fam65b" "Gramd3" "Itgb7" "Sell" "Zfp36l2"
## [8] "Cd7" "Cxcr3" "Pik3ip1"
## [1] ""
## [1] "Cks1b" "Tpi1" "Birc5" "Ube2c" "Txn1"
## [6] "Rrm2" "Ccnb2" "Hist1h2ao" "Cst3" "Cd74"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Nkg7" "Lag3" "Gzmb" "Prf1" "Ctla4" "Ccl3" "Ccnb2"
## [8] "Hilpda" "Ifng" "Impa2"
## [1] ""
## [1] "Ifi205" "Fgd2" "Csf2rb" "Ly86" "Rnase6" "Aif1" "Tbc1d8"
## [8] "Plbd1" "Cd24a" "Ctsh"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Ccr7" "Sell" "Xcl1" "Dapl1" "Lad1" "Gpr18" "Rgcc" "Cd69"
## [9] "Lta" "Fos"
## [1] ""
## [1] "Gzmb" "Id2" "Ccl4" "Lgals3" "Prf1" "Ccl3"
## [7] "Nkg7" "AW112010" "Actg1" "Hilpda"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Rrm2" "Spc24" "Hist2h3c2" "Hist2h3b" "Birc5"
## [6] "Kif22" "Aurka" "Fam83d" "Pkmyt1" "Hist1h2ao"
## [1] ""
## [1] "Cd70" "Cd83" "Xcl1" "Ccl1" "Lad1" "Cd81" "Plk2"
## [8] "Ccr8" "Slc2a6" "Lta"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Laptm4b" "Ccl22" "Oasl1" "Fscn1" "Ptafr" "Gcnt2" "Ebi3"
## [8] "Cacnb3" "Mreg" "Isg15"
## [1] ""
## [1] "Apoe" "C1qa" "Lyz2" "C1qc" "C1qb" "Fcgr4"
## [7] "Plbd1" "Cd300lf" "Selenbp1" "Emr1"
## [1] ""
## [1] ""
We explore the genes with highers loading to have an idea of which ones are driving global variation (e.g. important T cell genes as expected, such as Sell, Lag3, Gzmb, Prf1; cell cycle genes such as Ccnb2, Cks1b, etc )
PCElbowPlot(object = data.seurat)
###
###
Check distribution per sample (batch effects?)
data.seurat <- SetAllIdent(object = data.seurat, id = "mouse_id")
PCAPlot(object = data.seurat)
Check distribution of important genes for our subset
FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","Cd4"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
## $Ptprc
##
## $Cd2
##
## $Cd8a
##
## $Cd8b1
##
## $Cd4
Cell cycle is typically a strong source of transcriptomic variation
FeaturePlot(data.seurat,features.plot=c("Mki67","Ccna2","Cks1b"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
## $Mki67
##
## $Ccna2
##
## $Cks1b
Number of detected genes/reads count per cell associated to amount af mRNA: might be associated to biology (eg cell size, transcriptional activity), cell quality, technical effects, etc
FeaturePlot(data.seurat,features.plot=c("nUMI","nGene","percent.ribo"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
## $nUMI
##
## $nGene
##
## $percent.ribo
There are some outliers. Not interested (unless looking for rare cell states)
Spot genes among top genes contributing to PCs which might be associated to outliers
FeaturePlot(data.seurat,features.plot=c("H2-Eb1","Ctsh","H2-Aa","H2-Ab1","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
## $`H2-Eb1`
##
## $Ctsh
##
## $`H2-Aa`
##
## $`H2-Ab1`
##
## $Csf2rb
Indeed, these outliers express high levels of MHC II genes, Csf2rb, etc. suggesting these might be contaminating myeloid/APCs
Filter out outliers based on expression (or lack of) of a few genes (you might try multivariate filtering criteria as well)
data.seurat <- FilterCells(object = data.seurat, subset.names = c("Cd2", "Cd8a","Cd8b1","Cd4","Ctsh"),
low.thresholds = c(1, 1, 1,-Inf,-Inf), high.thresholds = c(Inf,Inf,Inf,exp(-10),exp(-10)))
print(paste("matrix dim",dim(data.seurat@data)))
## [1] "matrix dim 14314" "matrix dim 398"
FeaturePlot(data.seurat,features.plot=c("Ptprc","Cd2","Cd8a","Cd8b1","H2-Aa","Csf2rb"),reduction.use="pca",cols.use = c("lightgrey",
"blue"),nCol=2,no.legend = F,do.return=T)
## $Ptprc
##
## $Cd2
##
## $Cd8a
##
## $Cd8b1
##
## $`H2-Aa`
##
## $Csf2rb
Now it looks better
Also there is a strong effect from cell cycle genes, we might want to remove them from our list of highly variable genes used for dimensionality
Get list of cell cycle genes (map to Ensembl IDs)
#BiocManager::install("org.Mm.eg.db",type="source")
#BiocManager::install("clusterProfiler")
#BiocManager::install("DO.db",type="source")
#BiocManager::install("GO.db",type="source")
#signatureList <- readRDS("input/signatureList.rds")
library(org.Mm.eg.db)
library(clusterProfiler)
idMap <- bitr( rownames(data.seurat@data) , fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Mm.eg.db)
## Warning in bitr(rownames(data.seurat@data), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 5.71% of input gene IDs are fail to map...
cellCycle <- mget(c("GO:0007049"),org.Mm.egGO2ALLEGS)
cellCycle <- unique(unlist(cellCycle))
cellCycle.symbol <- unique(na.omit(idMap$SYMBOL[match(cellCycle,idMap$ENTREZID)]))
length(cellCycle.symbol)
## [1] 1293
print(length(data.seurat@var.genes))
## [1] 1616
data.seurat@var.genes <- setdiff(data.seurat@var.genes,cellCycle.symbol)
print(length(data.seurat@var.genes))
## [1] 1494
PCA
data.seurat <- ScaleData(object = data.seurat, do.scale=T, do.center = T) #since we filtered out cells, scaling might have slightly changed
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## Scaling data matrix
set.seed(1234)
data.seurat <- RunPCA(object = data.seurat, pc.genes = data.seurat@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 10)
## [1] "PC1"
## [1] "Pydc3" "Pde2a" "Itgb7" "Fam65b" "Sell" "Cxcr3" "Gramd3"
## [8] "Cd7" "Pik3ip1" "Igfbp4"
## [1] ""
## [1] "Tpi1" "Lag3" "Impa2" "Txn1" "Gapdh"
## [6] "Bcl2a1d" "Rrm2" "Hist1h2ao" "Gzmc" "Mt1"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Ccr7" "Sell" "Xcl1" "Lad1" "Dapl1" "Cd83" "Slc2a6"
## [8] "Fos" "Cd69" "Cd81"
## [1] ""
## [1] "Gzmb" "Ccl4" "Nkg7" "Prf1" "Lgals3"
## [6] "Actg1" "Ccl3" "AW112010" "AA467197" "Serpina3g"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Ly6c2" "Rrm2" "Hist2h3c2" "Hist2h3b" "Sell"
## [6] "Nrm" "Tgtp1" "Pde2a" "Mef2d" "Hist1h2ao"
## [1] ""
## [1] "Cd83" "Cd81" "Cd70" "Ccr8" "Ccl1" "Xcl1" "Lag3"
## [8] "Bcl2a1d" "Slc2a6" "Idi2"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Rrp9" "Utf1" "Apoe" "2410002F23Rik"
## [5] "Atf3" "Prdx4" "H2-Eb1" "Rrm2"
## [9] "Rps4y2" "Hsph1"
## [1] ""
## [1] "Lars2" "Gm16894" "Zc3h12a" "Lgals9" "Mcoln1" "Unc93b1" "Foxo3"
## [8] "Kdelc2" "Irf7" "Igflr1"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Gzmd" "Gzmg" "Gzme" "S1pr5" "Gzmf"
## [6] "Klrg1" "Gzma" "Gzmc" "Dtx1" "Serpinb9b"
## [1] ""
## [1] "Hist2h3c2" "Hist2h3b" "Ctla4" "Ifi27l2a" "Rtp4"
## [6] "Rrm2" "Hist1h2ao" "AW112010" "Ifit3" "Cxcl10"
## [1] ""
## [1] ""
PCElbowPlot(object = data.seurat)
ndim=10
tSNE
data.seurat <- RunTSNE(object = data.seurat, dims.use = 1:ndim, do.fast = F, seed.use=123, perplexity=30)
#pdf("out/tsne_Markers.pdf",width=10,height=10)
myMarkers <- qq(Sell,Il7r,Ccr7,Tcf7,Gzmb,Fasl,Prf1,Pdcd1,Lag3,Tigit,Havcr2,Entpd1,Mki67,Ccna2)
FeaturePlot(data.seurat,features.plot=myMarkers,reduction.use="tsne",cols.use = c("lightgrey", "blue"),nCol=3,no.legend = F,do.return=T)
## $Sell
##
## $Il7r
##
## $Ccr7
##
## $Tcf7
##
## $Gzmb
##
## $Fasl
##
## $Prf1
##
## $Pdcd1
##
## $Lag3
##
## $Tigit
##
## $Havcr2
##
## $Entpd1
##
## $Mki67
##
## $Ccna2
#dev.off()
Visually explore heterogenity of relevant genes
Clustering algorithm implemented in Seruat: shared nearest neighbor (SNN) modularity optimization based clustering algorithm. Important parameters: resolution, dims.use, k.param
set.seed(12345)
data.seurat <- FindClusters(object = data.seurat, reduction.type = "pca", dims.use = 1:ndim,
resolution = 0.5, print.output = 0, save.SNN = TRUE, force.recalc = T, k.param = 10)
data.seurat@meta.data$cluster <- factor(data.seurat@meta.data$res.0.5)
Define cluster colors
clusterColors <- gg_color_hue(length(levels(data.seurat@meta.data$cluster)))
names(clusterColors) <- levels(data.seurat@meta.data$cluster)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster", colors.use = clusterColors)
Hierarchical clustering (might want to try dynamicTreeCut as well)
set.seed(1234)
fit.hclust=hclust(dist(data.seurat@dr$pca@cell.embeddings[,1:ndim]),method="ward.D2")
nclus <- 6
clusters=factor(cutree(fit.hclust,k=nclus))
data.seurat@ident <- clusters
data.seurat <- StashIdent(object = data.seurat, save.name = "wardClustering")
data.seurat <- SetAllIdent(object = data.seurat, id = "wardClustering")
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "wardClustering")
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster")
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "wardClustering",
no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)
Good correspondence ###
###
K-Means
set.seed(1234)
nclus=6
clusters.kmeans <- kmeans(data.seurat@dr$pca@cell.embeddings[,1:ndim],nclus,nstart = 10)
data.seurat@ident <- factor(clusters.kmeans$cluster)
data.seurat <- StashIdent(object = data.seurat, save.name = "kmeansClustering")
TSNEPlot(object = data.seurat, do.return = TRUE, group.by = "kmeansClustering",
no.legend = TRUE, do.label = TRUE)
#ggsave("out/tSNE.0.pdf",width = 3,height = 3)
Also with KMeans
table(data.seurat@meta.data$wardClustering,data.seurat@meta.data$cluster)
# Rand index for cluster agreement
#install.packages("mclust")
library(mclust)
adjustedRandIndex(data.seurat@meta.data$wardClustering, data.seurat@meta.data$cluster)
adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$cluster)
adjustedRandIndex(data.seurat@meta.data$kmeans, data.seurat@meta.data$wardClustering)
#SNN clustering has more agreement with the other two
Checking co-expression of important genes
p1 <- plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")
p2 <- plot2geneSeuratTSNE(data.seurat,"Gzmb","Ccr7")
p3 <- plot2geneSeuratTSNE(data.seurat,"Lag3","Sell")
p4 <- plot2geneSeuratTSNE(data.seurat,"Havcr2","Entpd1")
plot_grid(p1,p2,p3,p4)
#ggsave("out/tSNE.markers.pdf",width = 10,height=8)
plot2geneSeuratTSNE(data.seurat,"Pdcd1","Tcf7")
plot2geneSeuratTSNE(data.seurat,"Mki67","Ccna2")
###
###
Sample distribution (biological variablity +batch effect)
for (s in levels(factor(data.seurat@meta.data$mouse_id))) {
mycol <- data.seurat@meta.data$mouse_id==s
print(ggplot(data=data.frame(data.seurat@dr$tsne@cell.embeddings), aes(x=tSNE_1,y=tSNE_2, color="gray")) +
geom_point(color="gray",alpha=0.5) +
geom_point(data=data.frame(data.seurat@dr$tsne@cell.embeddings[mycol,]),alpha=0.7, color="blue") +
theme_bw() + theme(aspect.ratio = 1, legend.position="right") + xlab("TSNE 1") + ylab("TSNE 2") + ggtitle(s))
}
classification of CD8 TIL states using TILPRED
#install.packages("BiocManager")
#install.packages("doParallel")
#install.packages("doRNG")
#BiocManager::install("GenomeInfoDbData",type="source")
#BiocManager::install("AUCell")
#BiocManager::install("SingleCellExperiment",type="source")
#install.packages("remotes")
#remotes::install_github("carmonalab/TILPRED")
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:clusterProfiler':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply
library(AUCell)
library(TILPRED)
data.sce <- Convert(data.seurat,to="sce")
data.sce.pred <- predictTilState(data.sce)
## Warning in predictTilState(data.sce): The following genes were not found
## Gm10282,Cks1brt,Pcna-ps2 . Unknown prediction performance.
## Genes in the gene sets NOT available in the dataset:
## NaiveVsExhausted_down: 1 (1% of 97)
## cycling: 2 (2% of 94)
table(data.sce.pred$predictedState)
##
## Naive Effector MemoryLike Exhausted unknown
## 46 90 68 184 10
str(data.sce.pred)
## List of 3
## $ predictedState : Factor w/ 5 levels "Naive","Effector",..: 5 4 4 4 4 3 2 4 4 4 ...
## ..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
## $ stateProbabilityMatrix: num [1:398, 1:4] 1.61e-02 2.57e-03 1.00e-06 1.38e-05 5.46e-05 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
## .. ..$ : chr [1:4] "Naive" "Effector" "MemoryLike" "Exhausted"
## $ cycling : Named logi [1:398] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..- attr(*, "names")= chr [1:398] "AH3VMFBGXY_A10_S10" "AH3VMFBGXY_A4_S4" "AH3VMFBGXY_A6_S6" "AH3VMFBGXY_A7_S7" ...
head(data.sce.pred$stateProbabilityMatrix)
## Naive Effector MemoryLike Exhausted
## AH3VMFBGXY_A10_S10 1.608488e-02 2.974940e-01 0.359993363 0.3264278
## AH3VMFBGXY_A4_S4 2.567036e-03 4.726579e-01 0.019469265 0.5053058
## AH3VMFBGXY_A6_S6 1.003015e-06 1.676011e-06 0.001023729 0.9989736
## AH3VMFBGXY_A7_S7 1.378348e-05 1.564267e-04 0.004174986 0.9956548
## AH3VMFBGXY_A9_S9 5.460452e-05 9.523511e-04 0.009209985 0.9897831
## AH3VMFBGXY_B11_S23 2.596244e-04 8.337796e-04 0.817061148 0.1818454
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$predictedState, col.name = "state.pred")
data.seurat <- AddMetaData(object = data.seurat, metadata = data.sce.pred$cycling, col.name = "cycling")
stateColorsPred <- c("#F8766D","#A58AFF","#00B6EB","#53B400","#000000")
names(stateColorsPred) <- qq(Naive,Effector,MemoryLike,Exhausted,NP)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cycling")
plot_grid(plot1, plot2)
plot1 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "state.pred", colors.use =stateColorsPred)
plot2 <- TSNEPlot(object = data.seurat, do.return = TRUE, no.legend = TRUE, do.label = TRUE, group.by = "cluster")
plot_grid(plot1, plot2)
Explore association of clusters and predicted states
library(pheatmap)
library(RColorBrewer)
a <- table(data.seurat@meta.data$state.pred,data.seurat@meta.data$cluster)
a.scaled <- scale(a,scale = colSums(a),center=F)*100
a.scaled
##
## 0 1 2 3 4
## Naive 0.000000 3.191489 0.000000 82.692308 0.000000
## Effector 1.709402 89.361702 0.000000 5.769231 2.083333
## MemoryLike 6.837607 1.063830 21.739130 3.846154 87.500000
## Exhausted 88.034188 6.382979 75.362319 0.000000 10.416667
## unknown 3.418803 0.000000 2.898551 7.692308 0.000000
##
## 5
## Naive 0.000000
## Effector 0.000000
## MemoryLike 0.000000
## Exhausted 100.000000
## unknown 0.000000
pheatmap(round(a.scaled),cexCol = 0.8,cluster_cols = F)
Cycling cells per cluster
barplot(tapply(data.seurat@meta.data$cycling,data.seurat@meta.data$cluster,mean))
#BiocManager::install("MAST")
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
cd8.markers <- FindAllMarkers(object = data.seurat, only.pos = TRUE, min.pct = 0.1, min.diff.pct=0.1, logfc.threshold = 0.25, test.use = "bimod",max.cells.per.ident = Inf)#Try MAST
cd8.markers.list <- split(cd8.markers,cd8.markers$cluster)
cd8.markers.list <- lapply((cd8.markers.list),function(x) x$gene)
lapply(cd8.markers.list,head)
## $`0`
## [1] "Tigit" "Havcr2" "Lag3" "Id2" "Cxcr6" "Prf1"
##
## $`1`
## [1] "Ccl5" "Alox8" "Gvin1" "Faim" "Mettl23" "Ccdc124"
##
## $`2`
## [1] "2810417H13Rik" "Gmnn" "Mcm5" "Hmmr"
## [5] "Cdc45" "Brip1"
##
## $`3`
## [1] "Sell" "Cnr2" "Rapgef4" "Tcf7" "Satb1" "Lef1"
##
## $`4`
## [1] "Ajuba" "Zfp202" "Cd83" "Ifi27l2a" "Ccr7" "Xcl1"
##
## $`5`
## [1] "Lix1l" "Gzmf" "Gzmg" "Gzmd" "Agphd1" "Gzmc"
set.seed(1234)
geneSample <- unique(unlist(lapply(cd8.markers.list,function(x) { head(x,n=10) })))
clusterList <- split(row.names(data.seurat@meta.data),data.seurat@meta.data$cluster)
cellSample <- unlist(lapply(clusterList,function(x) { head(x,n=25) }))
ann_col <- data.frame(cluster = data.seurat@meta.data$cluster)
row.names(ann_col) <- row.names(data.seurat@meta.data)
ann_colors <- list(cluster = clusterColors)
pheatmap(data.seurat@data[geneSample,cellSample], cluster_rows = T, cluster_cols = F, labels_col = "",annotation_col = ann_col[cellSample,,drop=F],scale = "row", clustering_distance_rows="correlation", annotation_colors = ann_colors)
myMarkers <- qq(Sell,Il7r,Lef1,S1pr1,Ccr7,Tcf7,Cd44,Ccl5,Gzmk,Cxcr3,Gzmb,Fasl,Prf1,Tox,Batf,Pdcd1,Lag3,Tigit,Havcr2,Entpd1,Mki67,Ccna2)
data.seurat <- SetAllIdent(object = data.seurat, id = "cluster")
DotPlot(data.seurat,genes.plot = myMarkers,x.lab.rot = 1,plot.legend =T,cols.use = "RdYlGn")
#ggsave("out/DotPlot.pdf",width = 10, height = 3)
Define a few example gene signatures
signatureList <- list()
signatureList$Cytotoxicity <- c("Prf1","Fasl","Gzmb")
signatureList$Stemness <- c("Tcf7","Sell","Il7r","Lef1")
signatureList$InhibitoryReceptors <- c("Pdcd1","Havcr2","Tigit","Lag3","Ctla4")
signatureList$cellCycleGenes <- cellCycle.symbol
Use AUCell package to calculate signature enrichment
#BiocManager::install("AUCell")
library(AUCell)
set.seed(1234)
cells_rankings <- AUCell::AUCell_buildRankings(as.matrix(data.seurat@data), nCores=2, plotStats=TRUE)
## Quantiles for the number of genes detected by cell:
## (Non-detected genes are shuffled at the end of the ranking. Keep it in mind when choosing the threshold for calculating the AUC).
## min 1% 5% 10% 50% 100%
## 1576.00 1595.94 1719.00 1885.80 2901.50 5786.00
## Using 2 cores.
cells_AUC <- AUCell_calcAUC(signatureList, cells_rankings, aucMaxRank=1500)
print(cells_AUC)
## AUC for 4 gene sets (rows) and 398 cells (columns).
##
## Top-left corner of the AUC matrix:
## cells
## gene sets AH3VMFBGXY_A10_S10 AH3VMFBGXY_A4_S4 AH3VMFBGXY_A6_S6
## Cytotoxicity 0.44570538 0.9178905 0.7725857
## Stemness 0.14173623 0.0000000 0.0000000
## InhibitoryReceptors 0.63233133 0.7472278 0.7910488
## cellCycleGenes 0.09842882 0.1018561 0.1251876
## cells
## gene sets AH3VMFBGXY_A7_S7 AH3VMFBGXY_A9_S9
## Cytotoxicity 0.7405429 0.6030263
## Stemness 0.0000000 0.0000000
## InhibitoryReceptors 0.6120240 0.6452906
## cellCycleGenes 0.1054429 0.1094186
data.seurat@meta.data <- data.seurat@meta.data[!grepl("^AUC",colnames(data.seurat@meta.data))]
data.seurat <- AddMetaData(data.seurat, metadata = t(assays(cells_AUC)$AUC), col.name = paste0("AUC_",rownames(cells_AUC)))
for (s in names(signatureList)) {
print(myFeaturePlotAUC(data.seurat,s,cells_AUC))
}
for (s in names(signatureList)) {
print(VlnPlot(object = data.seurat, features.plot = paste0("AUC_",s), point.size.use=0.001, cols.use=clusterColors, group.by = "cluster"))
}
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] doRNG_1.7.1 rngtools_1.3.1
## [3] pkgmaker_0.27 registry_0.5
## [5] foreach_1.4.4 RColorBrewer_1.1-2
## [7] pheatmap_1.0.12 TILPRED_0.1.0
## [9] AUCell_1.4.1 SingleCellExperiment_1.4.1
## [11] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
## [13] BiocParallel_1.16.5 matrixStats_0.54.0
## [15] GenomicRanges_1.34.0 GenomeInfoDb_1.18.1
## [17] clusterProfiler_3.10.1 org.Mm.eg.db_3.7.0
## [19] AnnotationDbi_1.44.0 IRanges_2.16.0
## [21] S4Vectors_0.20.1 Biobase_2.42.0
## [23] BiocGenerics_0.28.0 tidyr_1.0.0
## [25] dplyr_0.8.3 Seurat_2.3.4
## [27] Matrix_1.2-17 cowplot_0.9.4
## [29] ggplot2_3.1.0
##
## loaded via a namespace (and not attached):
## [1] reticulate_1.10 R.utils_2.7.0 tidyselect_0.2.5
## [4] RSQLite_2.1.1 htmlwidgets_1.3 grid_3.5.2
## [7] trimcluster_0.1-2.1 Rtsne_0.15 munsell_0.5.0
## [10] codetools_0.2-16 ica_1.0-2 units_0.6-2
## [13] withr_2.1.2 colorspace_1.4-0 GOSemSim_2.8.0
## [16] knitr_1.20 rstudioapi_0.9.0 ROCR_1.0-7
## [19] robustbase_0.93-5 dtw_1.20-1 DOSE_3.8.2
## [22] gbRd_0.4-11 Rdpack_0.10-1 labeling_0.3
## [25] lars_1.2 urltools_1.7.1 GenomeInfoDbData_1.2.0
## [28] bit64_0.9-7 farver_1.1.0 vctrs_0.2.0
## [31] doParallel_1.0.14 diptest_0.75-7 R6_2.3.0
## [34] hdf5r_1.2.0 flexmix_2.3-14 bitops_1.0-6
## [37] fgsea_1.8.0 gridGraphics_0.3-0 assertthat_0.2.0
## [40] promises_1.0.1 SDMTools_1.1-221 scales_1.0.0
## [43] ggraph_1.0.2 nnet_7.3-12 enrichplot_1.2.0
## [46] gtable_0.2.0 npsurv_0.4-0 rlang_0.4.0
## [49] zeallot_0.1.0 splines_3.5.2 lazyeval_0.2.1
## [52] acepack_1.4.1 europepmc_0.3 checkmate_1.9.1
## [55] yaml_2.2.0 reshape2_1.4.3 backports_1.1.3
## [58] httpuv_1.4.5.1 qvalue_2.14.1 Hmisc_4.2-0
## [61] tools_3.5.2 ggplotify_0.0.3 ellipsis_0.2.0.1
## [64] gplots_3.0.1 proxy_0.4-22 ggridges_0.5.1
## [67] Rcpp_1.0.2 plyr_1.8.4 base64enc_0.1-3
## [70] progress_1.2.0 zlibbioc_1.28.0 purrr_0.3.0
## [73] RCurl_1.95-4.11 prettyunits_1.0.2 rpart_4.1-13
## [76] pbapply_1.3-4 viridis_0.5.1 zoo_1.8-4
## [79] ggrepel_0.8.0 cluster_2.0.7-1 magrittr_1.5
## [82] data.table_1.12.2 DO.db_2.9 lmtest_0.9-36
## [85] triebeard_0.3.0 RANN_2.6.1 mvtnorm_1.0-11
## [88] packrat_0.5.0 fitdistrplus_1.0-14 mime_0.6
## [91] xtable_1.8-3 hms_0.4.2 lsei_1.2-0
## [94] evaluate_0.12 XML_3.98-1.16 mclust_5.4.5
## [97] gridExtra_2.3 compiler_3.5.2 tibble_2.1.3
## [100] KernSmooth_2.23-15 crayon_1.3.4 R.oo_1.22.0
## [103] htmltools_0.3.6 later_0.7.5 segmented_0.5-3.0
## [106] Formula_1.2-3 snow_0.4-3 DBI_1.0.0
## [109] tweenr_1.0.1 MASS_7.3-51.1 fpc_2.1-11.1
## [112] R.methodsS3_1.7.1 gdata_2.18.0 metap_1.0
## [115] igraph_1.2.2 pkgconfig_2.0.2 rvcheck_0.1.3
## [118] foreign_0.8-71 xml2_1.2.2 annotate_1.60.0
## [121] XVector_0.22.0 bibtex_0.4.2 stringr_1.3.1
## [124] digest_0.6.18 tsne_0.1-3 graph_1.60.0
## [127] rmarkdown_1.11 fastmatch_1.1-0 htmlTable_1.13.1
## [130] GSEABase_1.44.0 kernlab_0.9-27 shiny_1.2.0
## [133] gtools_3.8.1 modeltools_0.2-22 lifecycle_0.1.0
## [136] nlme_3.1-137 jsonlite_1.6 viridisLite_0.3.0
## [139] pillar_1.3.1 lattice_0.20-38 httr_1.4.0
## [142] DEoptimR_1.0-8 survival_2.43-3 GO.db_3.7.0
## [145] glue_1.3.0 UpSetR_1.3.3 png_0.1-7
## [148] prabclus_2.2-7 iterators_1.0.10 bit_1.1-14
## [151] ggforce_0.1.3 class_7.3-15 stringi_1.2.4
## [154] mixtools_1.1.0 blob_1.1.1 doSNOW_1.0.16
## [157] latticeExtra_0.6-28 caTools_1.17.1.1 memoise_1.1.0
## [160] irlba_2.3.3 ape_5.2